#Import all the necessary modules
import pandas as pd
import numpy as np
import os
import warnings
import seaborn as sns
warnings.filterwarnings('ignore')
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report,roc_auc_score
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
# --- Load the data set and encode the target label ---
vehicle_df = pd.read_csv("vehicle-2.csv")
vehicle_df.shape
vehicle_df.head()

# Encode the string class labels into integers (0/1/2).
class_le = LabelEncoder()
columns = vehicle_df.columns
vehicle_df['class'] = class_le.fit_transform(vehicle_df['class'])

# BUG FIX: the original plotted new_vehicle_df['class'], but new_vehicle_df
# is only created a few lines below (NameError). Plot the already-encoded
# vehicle_df instead.
sns.countplot(vehicle_df['class'])
plt.show()

vehicle_df.info()
print("Original null value count:", vehicle_df.isnull().sum())

# Work on a copy so the raw frame stays untouched.
new_vehicle_df = vehicle_df.copy()
# --- Impute missing values with per-column medians, then basic EDA ---
X = new_vehicle_df.iloc[:, 0:19]  # separating all numerical attributes
# BUG FIX: the `verbose` parameter of SimpleImputer was deprecated and
# later removed from scikit-learn, so it is no longer passed here.
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
# fill missing values with median column values
transformed_values = imputer.fit_transform(X)
column = X.columns
new_vehicle_df = pd.DataFrame(transformed_values, columns=column)
new_vehicle_df.describe()
print("\n\nCount after we imputed the NaN value: ", new_vehicle_df.isnull().sum())

# Check for duplicate data
dups = new_vehicle_df.duplicated()
print('Number of duplicate rows = %d' % (dups.sum()))
new_vehicle_df[dups]

# Distribution of every attribute
plt.style.use('fivethirtyeight')
new_vehicle_df.hist(bins=20, figsize=(60, 40), color='lightblue', edgecolor='orange')
plt.show()

skewValue = new_vehicle_df.skew()
print("skewValue of dataframe attributes: ", skewValue)
# Lets check to see if the data have outliers
plt.figure(figsize=(10, 10))
ax = sns.boxplot(data=new_vehicle_df, orient="horizontal")

### Lets see which attributes have outliers
from scipy.stats import iqr
Qtr1 = new_vehicle_df.quantile(0.25)
Qtr3 = new_vehicle_df.quantile(0.75)
IQR = Qtr3 - Qtr1
print(IQR)
# BUG FIX: the original filter referenced undefined names Q1/Q3; the
# quartiles computed above are Qtr1/Qtr3. Keep only rows with every
# attribute inside the 1.5*IQR whiskers.
clean_df = new_vehicle_df[~((new_vehicle_df < (Qtr1 - 1.5 * IQR)) | (new_vehicle_df > (Qtr3 + 1.5 * IQR))).any(axis=1)]

# Re-draw the boxplot on the filtered frame to confirm outliers are gone
plt.figure(figsize=(10, 10))
ax = sns.boxplot(data=clean_df, orient="horizontal")
# Correlation analysis on the feature columns (target column dropped).
# NOTE(review): this reassignment discards the IQR-filtered frame computed
# above and correlates the full imputed data — preserved from the original.
clean_df = new_vehicle_df.drop('class', axis=1)
# BUG FIX: the original referenced undefined `cleandf`; the frame created
# on the previous line is `clean_df`. Title typo "fearures" also fixed.
correlation = clean_df.corr()
plt.figure(figsize=(30, 15))
sns.heatmap(correlation, vmax=1, square=True, annot=True, cmap='YlGnBu')
plt.title('Correlation between different features')
plt.show();
# Observations (strong correlations):
# a) circularity is strongly correlated to max.length_rectangularity and scaled_radius_of_gyration
# b) distance_circularity is strongly correlated to scatter_ratio, pr_axis_aspect_ratio, scaled_variance and scaled_variance.1
# c) it is also negatively correlated to elongatedness
# d) in fact elongatedness is negatively correlated to several parameters like compactness, circularity, radius_ratio, distance_circularity, scatter_ratio, pr.axis_rectangularity, scaled_variance etc.
# e) scaled_variance and scaled_variance.1 are strongly correlated
# f) scatter_ratio, pr.axis_aspect_ratio, scaled_variance and scaled_variance.1 are also correlated to several parameters like compactness, circularity, distance_circularity, radius_ratio, scaled_radius_of_gyration
# g) skewness_about.2 and hollows_ratio have strong negative correlation with scaled_radius_of_gyration.1
# Observations (weak correlations):
# a) max_length_aspect_ratio has average or low correlation with most parameters except distance_circularity
# b) pr.axis_aspect_ratio seems to have very little correlation with most parameters
# c) scaled_radius_of_gyration.1 seems to be very little correlated with most parameters
# d) skewness_about, skewness_about.1 and skewness_about.2 have very little correlation with most parameters except perhaps hollows_ratio
sns.pairplot(clean_df, diag_kind="kde")
## The above pairplot also shows visually the strong and weak correlations stated above
## Scaled_variance & scaled_variance.1 are strongly correlated, so one of them can be dropped
# --- Manual PCA on the 18 independent attributes ---
X = new_vehicle_df.iloc[:, 0:18].values  # feature matrix (18 attributes)
y = new_vehicle_df.iloc[:, 18].values    # encoded class label
### We will do PCA according to the following steps
### 1) Split our data into train and test data set
### 2) Normalize the training set using standard scaler
### 3) Calculate the covariance matrix.
### 4) Calculate the eigenvectors and their eigenvalues.
### 5) Sort the eigenvectors according to their eigenvalues in descending order.
### 6) Choose the first K eigenvectors (where K is the dimension we'd like to end up with).
### 7) Build new dataset with reduced dimensionality.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_std = sc.fit_transform(X)
cov_matrix = np.cov(X_std.T)
print("cov_matrix shape:", cov_matrix.shape)
print("Covariance_matrix", cov_matrix)
### Calculating Eigen Vectors & Eigen Values: using numpy linear algebra
# BUG FIX: use eigh instead of eig — the covariance matrix is symmetric,
# and eig can return spurious complex values from floating-point noise,
# while eigh guarantees real eigenvalues/eigenvectors.
eigenvalues, eigenvectors = np.linalg.eigh(cov_matrix)
print('Eigen Vectors \n%s', eigenvectors)
print('\n Eigen Values \n%s', eigenvalues)
# Make a set of (eigenvalue, eigenvector) pairs:
eig_pairs = [(eigenvalues[index], eigenvectors[:, index]) for index in range(len(eigenvalues))]
# Sort the pairs from highest to lowest eigenvalue.
# BUG FIX: sort on the eigenvalue only — the original plain tuple sort
# falls through to comparing the eigenvector arrays whenever two
# eigenvalues tie, which raises an error for numpy arrays.
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
# Extract the descending ordered eigenvalues and eigenvectors
eigvalues_sorted = [pair[0] for pair in eig_pairs]
eigvectors_sorted = [pair[1] for pair in eig_pairs]
# Let's confirm our sorting worked, print out eigenvalues
print('Eigenvalues in descending order: \n%s' % eigvalues_sorted)
tot = sum(eigenvalues)
var_explained = [(i / tot) for i in sorted(eigenvalues, reverse=True)]  # variance explained by each of the 18 components
cum_var_exp = np.cumsum(var_explained)  # cumulative variance; the 18th entry reaches ~100%
plt.bar(range(1, 19), var_explained, alpha=0.5, align='center', label='individual explained variance')
plt.step(range(1, 19), cum_var_exp, where='mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc='best')
plt.show()
# P_reduce represents the reduced mathematical space...
P_reduce = np.array(eigvectors_sorted[0:8])  # keep the top 8 of the 18 principal components
X_std_8D = np.dot(X_std, P_reduce.T)  # project the original data onto the principal components
reduced_pca = pd.DataFrame(X_std_8D)  # convert array to dataframe for pairplot
sns.pairplot(reduced_pca, diag_kind='kde')
# --- Split 70:30 and train an SVC on both the raw and the PCA data ---
# original (standardised) data
X_train, X_test, y_train, y_test = train_test_split(X_std, y, test_size=0.30, random_state=1)
# PCA data (same random_state, so the row split is identical)
pca_X_train, pca_X_test, pca_y_train, pca_y_test = train_test_split(reduced_pca, y, test_size=0.30, random_state=1)

# fit the model on the original standardised data
svc = SVC()
svc.fit(X_train, y_train)
y_predict = svc.predict(X_test)

# fit a second model on the PCA-reduced data
svc1 = SVC()
svc1.fit(pca_X_train, pca_y_train)
pca_y_predict = svc1.predict(pca_X_test)

# display accuracy score of both models
# BUG FIX: the original referenced undefined Orig_X_test / Orig_y_test /
# Orig_y_predict; the variables produced by the split above are
# X_test / y_test / y_predict.
print("Model Score On Original Data ", svc.score(X_test, y_test))
print("Model Score On Reduced PCA Dimension ", svc1.score(pca_X_test, pca_y_test))
print("Before PCA On Original 18 Dimension", accuracy_score(y_test, y_predict))
print("After PCA(On 8 dimension)", accuracy_score(pca_y_test, pca_y_predict))
# Calculate Confusion Matrix & Plot To Visualize it
def draw_confmatrix(y_test, yhat, str1, str2, str3, datatype):
    """Print and heatmap-plot the 3-class confusion matrix.

    y_test  : true labels (encoded 0/1/2)
    yhat    : predicted labels
    str1-3  : display names for the three classes on the heatmap axes
    datatype: description of the data set, used in the printed header
    """
    # BUG FIX: `labels` is keyword-only in scikit-learn >= 1.0 — passing
    # [0, 1, 2] as a positional argument raises a TypeError there.
    cm = confusion_matrix(y_test, yhat, labels=[0, 1, 2])
    print("Confusion Matrix For : \n", datatype, cm)
    sns.heatmap(cm, annot=True, fmt='.2f',
                xticklabels=[str1, str2, str3], yticklabels=[str1, str2, str3])
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
# BUG FIX: the original passed undefined Orig_y_test / Orig_y_predict;
# the raw-data results from the earlier split are y_test / y_predict.
draw_confmatrix(y_test, y_predict, "Van ", "Car ", "Bus", "Original Data Set\n")
draw_confmatrix(pca_y_test, pca_y_predict, "Van ", "Car ", "Bus", "For Reduced Dimensions Using PCA \n")
# Classification Report Of Model built on Raw Data
print("Classification Report For Raw Data:", "\n", classification_report(y_test, y_predict))
# Classification Report Of Model built on Principal Components:
print("Classification Report For PCA:", "\n", classification_report(pca_y_test, pca_y_predict))
import itertools
def classifiers_hypertune(name, rf, param_grid, x_train_scaled, y_train, x_test_scaled, y_test, CV):
    """Grid-search `rf` over `param_grid` with CV folds and report results.

    Prints the best cross-validation score and parameters, a classification
    report on the test set, a confusion-matrix heatmap, and the rounded
    test accuracy.
    """
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=CV, verbose=1, n_jobs=-1)
    grid_search.fit(x_train_scaled, y_train)
    # NOTE(review): the train-set prediction is never used; kept for parity
    # with the original behaviour.
    y_pred_train = grid_search.predict(x_train_scaled)
    y_pred_test = grid_search.predict(x_test_scaled)
    print('Best Score: ', grid_search.best_score_)
    print('Best Params: ', grid_search.best_params_)
    # Classification Report
    print(name + " Classification Report: ")
    print(classification_report(y_test, y_pred_test))
    # Confusion Matrix for test data
    draw_confmatrix(y_test, y_pred_test, "Van", "Car", "Bus", "Original Data Set")
    # NOTE(review): label hardcodes "SVM" even though `name` is a parameter.
    print("SVM Accuracy Score:", round(accuracy_score(y_test, y_pred_test), 2) * 100)
# --- Hyper-parameter tuning of the SVM classifier ---
from sklearn.model_selection import GridSearchCV
svmc = SVC()
# Let's see what all parameters one can tweak
print("SVM Parameters:", svmc.get_params())
# Create the parameter grid based on the results of random search
param_grid = [
    {'C': [0.01, 0.05, 0.5, 1], 'kernel': ['linear']},
    {'C': [0.01, 0.05, 0.5, 1], 'kernel': ['rbf']},
]
param_grid_1 = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
# BUG FIX: the original calls used undefined names (X_train_std_pca,
# SplitScale_y_train, ssx_train_sd, ssx_test_sd, SplitScale_y_test).
# They are mapped to the splits created earlier: pca_X_train/pca_X_test
# for the PCA space, X_train/X_test for the standardised original space,
# and y_train/y_test for the labels (both splits share random_state=1,
# so the label split is identical).
classifiers_hypertune("Support Vector Classifier", svmc, param_grid, pca_X_train, y_train, pca_X_test, y_test, 10)
classifiers_hypertune("Support Vector Classifier", svmc, param_grid, X_train, y_train, X_test, y_test, 10)
classifiers_hypertune("Support Vector Classifier_iterarion2", svmc, param_grid_1, pca_X_train, y_train, pca_X_test, y_test, 10)
classifiers_hypertune("Support Vector Classifier", svmc, param_grid_1, X_train, y_train, X_test, y_test, 10)
# --- Compare other classifiers on raw vs PCA data ---
# BUG FIX: the originals used undefined names (ssx_train_sd, ssx_test_sd,
# SplitScale_y_train, SplitScale_y_test, X_train_std_pca, X_test_std_pca);
# they are mapped to the splits created earlier: X_train/X_test for the
# standardised original space, pca_X_train/pca_X_test for the PCA space,
# y_train/y_test for the labels.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, y_train)
print('Before PCA score', model.score(X_test, y_test))
model.fit(pca_X_train, y_train)
print('After PCA score', model.score(pca_X_test, y_test))

from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)
print('Before PCA score', nb.score(X_test, y_test))
nb.fit(pca_X_train, y_train)
print('After PCA score', nb.score(pca_X_test, y_test))

from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(criterion='entropy')
dt_model.fit(X_train, y_train)
print('Before PCA score', dt_model.score(X_test, y_test))
dt_model.fit(pca_X_train, y_train)
print('After PCA score', dt_model.score(pca_X_test, y_test))